library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(patchwork)
library(treemapify)
library(reshape2)
library(plyr)
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
library(tidyverse)
## -- Attaching packages -------------------------------------------------------------------------------- tidyverse 1.2.1 --
## v tibble  2.1.3     v purrr   0.3.2
## v tidyr   1.0.0     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.4.0
## -- Conflicts ----------------------------------------------------------------------------------- tidyverse_conflicts() --
## x plyr::arrange()   masks dplyr::arrange()
## x purrr::compact()  masks plyr::compact()
## x plyr::count()     masks dplyr::count()
## x plyr::failwith()  masks dplyr::failwith()
## x dplyr::filter()   masks stats::filter()
## x plyr::id()        masks dplyr::id()
## x dplyr::lag()      masks stats::lag()
## x plyr::mutate()    masks dplyr::mutate()
## x plyr::rename()    masks dplyr::rename()
## x plyr::summarise() masks dplyr::summarise()
## x plyr::summarize() masks dplyr::summarize()
library(plotly)
## 
## Attaching package: 'plotly'
## The following objects are masked from 'package:plyr':
## 
##     arrange, mutate, rename, summarise
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(maps)
## 
## Attaching package: 'maps'
## The following object is masked from 'package:purrr':
## 
##     map
## The following object is masked from 'package:plyr':
## 
##     ozone
# US data: semicolon-separated file; the "All causes" aggregate rows are
# dropped right away so only individual causes remain.
usa_deaths_states <- read.csv("data/death-causes-usa.csv", sep = ";") %>%
  filter(Cause.Name != "All causes")

# Spanish data: semicolon-separated file, read with the UTF-8-BOM encoding.
spain_deaths <- read.csv(
  "data/death-causes-spain-2017-modified.csv",
  sep = ";",
  fileEncoding = "UTF-8-BOM"
)

USA data set

The data published by the Centers for Disease Control and Prevention was gathered by the National Center for Health Statistics (NCHS), with the last revision being made in 2017. This data set contains the information of the 10 leading causes of death in the United States. The data is based on information from resident death certificates filed in the 50 states and the District of Columbia using demographic and medical characteristics. The raw data set holds 10868 observations; after the "All causes" rows are dropped at load time, 9880 observations remain, each with the following 6 features:

dim(usa_deaths_states)
## [1] 9880    6
head(usa_deaths_states)
##   Year                                      X113.Cause.Name
## 1 2017 Accidents (unintentional injuries) (V01-X59,Y85-Y86)
## 2 2017 Accidents (unintentional injuries) (V01-X59,Y85-Y86)
## 3 2017 Accidents (unintentional injuries) (V01-X59,Y85-Y86)
## 4 2017 Accidents (unintentional injuries) (V01-X59,Y85-Y86)
## 5 2017 Accidents (unintentional injuries) (V01-X59,Y85-Y86)
## 6 2017 Accidents (unintentional injuries) (V01-X59,Y85-Y86)
##               Cause.Name         State Deaths Age.adjusted.Death.Rate
## 1 Unintentional injuries United States 169936                    49.4
## 2 Unintentional injuries       Alabama   2703                    53.8
## 3 Unintentional injuries        Alaska    436                    63.7
## 4 Unintentional injuries       Arizona   4184                    56.2
## 5 Unintentional injuries      Arkansas   1625                    51.8
## 6 Unintentional injuries    California  13840                    33.2

Spanish data set

The second data set details the causes of death in Spain for the year 2017. It was found on the government website, https://datos.gob.es/, although, since the original project proposal, it is no longer available at the location where it was found. This data was collected as part of a study done by the Spanish Institute of Statistics (INE). Due to the layout of the data, it had to be transformed before it could be processed. During this process the list of disease types was reduced by combining multiple entries of the same type into one, so that it could be compared with the USA data set. For example, the Spanish data set lists 30 types of cancer while the USA set lists 1, so the Spanish figures were summed under the name “Cancer”. The resulting data consists of 1056 observations with the following 4 features:

* **DISEASE**: Name of cause of death
* **GENDER**: The gender of the people represented by the observation; includes Males, Females and Both (total of both males and females)
* **AGE**: The age range of the people who died in the observation
* **NUMBER.OF.DEATHS**: The number of people that died
dim(spain_deaths)
## [1] 1056    4
head(spain_deaths)
##      DISEASE  GENDER      AGE NUMBER.OF.DEATHS
## 1 All causes    Both All ages           424523
## 2 All causes   Males All ages           214236
## 3 All causes Females All ages           210287
## 4 All causes    Both   0 to 1             1092
## 5 All causes   Males   0 to 1              619
## 6 All causes Females   0 to 1              473

As we can see, the two countries' data sets are structured differently, so the comparison will be tough. We will first keep only the country-level rows, filtering out the state-level data in the US data set in order to view the country as a whole.

# National view: keep only the rows whose State is "United States".
usa_deaths <- filter(usa_deaths_states, State == "United States")
head(usa_deaths)
##   Year                                      X113.Cause.Name
## 1 2017 Accidents (unintentional injuries) (V01-X59,Y85-Y86)
## 2 2017                            Alzheimer's disease (G30)
## 3 2017                   Cerebrovascular diseases (I60-I69)
## 4 2017         Chronic lower respiratory diseases (J40-J47)
## 5 2017                          Diabetes mellitus (E10-E14)
## 6 2017          Diseases of heart (I00-I09,I11,I13,I20-I51)
##               Cause.Name         State Deaths Age.adjusted.Death.Rate
## 1 Unintentional injuries United States 169936                    49.4
## 2    Alzheimer's disease United States 121404                    31.0
## 3                 Stroke United States 146383                    37.6
## 4                   CLRD United States 160201                    40.9
## 5               Diabetes United States  83564                    21.5
## 6          Heart disease United States 647457                   165.0
# Stacked area chart of the age-adjusted death rate per cause and year,
# built from the national rows in usa_deaths.
area_plot <- ggplot(
  usa_deaths,
  aes(x = Year, y = Age.adjusted.Death.Rate, fill = Cause.Name)
) +
  geom_area() +
  # The areas are mapped to `fill`, so the palette has to be set on the fill
  # scale; a colour scale would be ignored here.
  scale_fill_brewer(palette = "Paired") +
  labs(
    title = "Trend of death causes",
    x = "",
    y = "Death Rate / 100.000 (Age Adjusted)",
    fill = "Causes"
  ) +
  # theme_minimal() is a complete theme and replaces earlier theme() settings,
  # so it is applied first and the title centring is layered on top of it.
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))

# Line chart of the log of the yearly death counts, one line per cause,
# built from the national rows in usa_deaths.
line_plot <- ggplot(
  usa_deaths,
  aes(x = Year, y = log(Deaths), color = Cause.Name)
) +
  geom_line() +
  scale_color_brewer(palette = "Paired") +
  labs(
    title = "Changes on death causes over the years",
    x = "",
    # The plotted quantity is log(Deaths), not a rate.
    y = "log(Deaths)",
    # The legend belongs to the colour aesthetic, so it is titled via `color`.
    color = "Causes"
  ) +
  # theme_minimal() is a complete theme and replaces earlier theme() settings,
  # so it is applied first and the title centring is layered on top of it.
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))

# Render both ggplot objects through plotly's ggplotly() converter.
ggplotly(area_plot)
ggplotly(line_plot)
# Ten deadliest causes in Spain: use the both-gender, all-ages totals and
# leave out the "All causes" and "Other causes" aggregate rows.
topTenSpanishDiseases <- spain_deaths %>%
  filter(
    GENDER == "Both",
    AGE == "All ages",
    DISEASE != "All causes",
    DISEASE != "Other causes"
  ) %>%
  top_n(10, NUMBER.OF.DEATHS)

# Tree map of the top causes: tile area follows the death count, tiles are
# coloured by cause and labelled with the count itself.
ggplot(
  topTenSpanishDiseases,
  aes(area = NUMBER.OF.DEATHS, fill = DISEASE, label = NUMBER.OF.DEATHS)
) +
  geom_treemap() +
  geom_treemap_text(
    colour = "white",
    fontface = "italic",
    place = "centre",
    reflow = TRUE,
    grow = FALSE
  ) +
  labs(title = "Most deadly diseases in Spain (2017)", fill = "Causes") +
  scale_fill_brewer(palette = "Paired")

# Per-gender, all-ages totals restricted to the causes found in
# topTenSpanishDiseases.
topTenSpanishDiseasesByGenre <- spain_deaths %>%
  filter(
    GENDER != "Both",
    DISEASE != "All causes",
    AGE == "All ages",
    DISEASE %in% topTenSpanishDiseases$DISEASE
  )

# Re-level DISEASE with reorder() so the factor order follows the death counts.
topTenSpanishDiseasesByGenre$DISEASE <- reorder(
  topTenSpanishDiseasesByGenre$DISEASE,
  topTenSpanishDiseasesByGenre$NUMBER.OF.DEATHS
)

# Stacked bar chart: one bar per gender, segments coloured by cause of death.
barplot <- ggplot(
  topTenSpanishDiseasesByGenre,
  aes(x = GENDER, y = NUMBER.OF.DEATHS, fill = DISEASE)
) +
  geom_bar(stat = "identity", position = position_stack(), width = 0.4) +
  scale_fill_brewer(palette = "Paired") +
  labs(x = "", y = "Number of deaths (2017)") +
  theme_minimal()

# Static version
barplot

# Interactive version with a custom bar gap
ggplotly(barplot) %>%
  layout(bargap = 0.1) # messes with the legend
# Second take on the stacked bar chart: narrower bars with a black outline.
barplot <- ggplot(
  topTenSpanishDiseasesByGenre,
  aes(fill = DISEASE, y = NUMBER.OF.DEATHS, x = GENDER)
) +
  geom_col(position = position_stack(), width = 0.3, colour = "black") +
  labs(x = "", y = "Number of deaths (2017)") +
  scale_fill_brewer(palette = "Paired") +
  theme_minimal()

# Static version
barplot

# Interactive version; an empty legend list is passed to layout()
ggplotly(barplot) %>%
  layout(legend = list()) # messes with the legend
# Start from a copy of the Spanish data; it is narrowed down to the
# circulatory-system rows below.
diseasesCircSystemData <- spain_deaths

# Make AGE a factor whose levels run from the youngest to the oldest age band
# (with "All ages" last) so the pyramid plot shows the bands in that order.
diseasesCircSystemData$AGE <- factor(
  diseasesCircSystemData$AGE,
  c(
    "0 to 1", "1 to 4", "5 to 9", "10 to 14", "15 to 19", "20 to 24",
    "25 to 29", "30 to 34", "35 to 39", "40 to 44", "45 to 49", "50 to 54",
    "55 to 59", "60 to 64", "65 to 69", "70 to 74", "75 to 79", "80 to 84",
    "85 to 89", "90 to 94", "95 or more", "All ages"
  )
)

# Keep the per-gender, per-age-band rows for circulatory-system diseases and
# drop the DISEASE column.
diseasesCircSystemData <- diseasesCircSystemData %>%
  filter(
    DISEASE == "Diseases of the circulatory system",
    GENDER != "Both",
    AGE != "All ages"
  ) %>%
  modify_at("DISEASE", ~NULL)

summary(diseasesCircSystemData)
##      GENDER         AGE     NUMBER.OF.DEATHS 
##  Both   : 0   0 to 1  : 2   Min.   :    0.0  
##  Females:21   1 to 4  : 2   1st Qu.:   14.5  
##  Males  :21   5 to 9  : 2   Median :  348.5  
##               10 to 14: 2   Mean   : 2274.5  
##               15 to 19: 2   3rd Qu.: 2880.8  
##               20 to 24: 2   Max.   :13648.0  
##               (Other) :30
# Flip the sign of the male counts so their bars point the opposite way in the
# pyramid plot; all other rows keep their count unchanged.
diseasesCircSystemData <- diseasesCircSystemData %>%
  mutate(
    NUMBER.OF.DEATHS = ifelse(
      GENDER != "Males",
      NUMBER.OF.DEATHS,
      -1 * NUMBER.OF.DEATHS
    )
  )

# Population-pyramid style chart: one horizontal bar per age band and gender.
# Female and male rows are drawn as two separate layers.
ggplot(
  diseasesCircSystemData,
  aes(x = AGE, y = NUMBER.OF.DEATHS, fill = GENDER)
) +
  geom_bar(
    data = diseasesCircSystemData[diseasesCircSystemData$GENDER == "Females", ],
    stat = "identity"
  ) +
  geom_bar(
    data = diseasesCircSystemData[diseasesCircSystemData$GENDER == "Males", ],
    stat = "identity"
  ) +
  # Ticks every 5,000 deaths; labels are shown in thousands and as absolute
  # values, so both sides of the axis read 5, 10, 15.
  scale_y_continuous(
    breaks = seq(-15000, 15000, 5000),
    labels = as.character(abs(seq(-15, 15, 5)))
  ) +
  scale_fill_brewer(palette = "Set1") +
  coord_flip(ylim = c(-15000, 15000)) +
  labs(
    title = "Number of people who die from Diseases of the circulatory system",
    x = "Age",
    y = "Number of deaths (1,000)",
    fill = "Gender"
  ) +
  theme_bw()

unique(usa_deaths_states$State)
##  [1] United States        Alabama              Alaska              
##  [4] Arizona              Arkansas             California          
##  [7] Colorado             Connecticut          Delaware            
## [10] District of Columbia Florida              Georgia             
## [13] Hawaii               Idaho                Illinois            
## [16] Indiana              Iowa                 Kansas              
## [19] Kentucky             Louisiana            Maine               
## [22] Maryland             Massachusetts        Michigan            
## [25] Minnesota            Mississippi          Missouri            
## [28] Montana              Nebraska             Nevada              
## [31] New Hampshire        New Jersey           New Mexico          
## [34] New York             North Carolina       North Dakota        
## [37] Ohio                 Oklahoma             Oregon              
## [40] Pennsylvania         Rhode Island         South Carolina      
## [43] South Dakota         Tennessee            Texas               
## [46] Utah                 Vermont              Virginia            
## [49] Washington           West Virginia        Wisconsin           
## [52] Wyoming             
## 52 Levels: Alabama Alaska Arizona Arkansas California ... Wyoming
unique(usa_deaths_states$Cause.Name)
##  [1] Unintentional injuries  Alzheimer's disease    
##  [3] Stroke                  CLRD                   
##  [5] Diabetes                Heart disease          
##  [7] Influenza and pneumonia Suicide                
##  [9] Cancer                  Kidney disease         
## 11 Levels: All causes Alzheimer's disease Cancer CLRD ... Unintentional injuries
# State-level rows for 2017 only; the national "United States" row is excluded.
data <- usa_deaths_states %>%
  filter(Year == 2017, State != "United States")

# Attach each state's death total summed over all remaining causes.
# aggregate() names that total column `x`.
data <- left_join(
  data,
  aggregate(data$Deaths, by = list(State = data$State), FUN = sum),
  by = "State"
)

# Only the suicide rows are mapped.
data <- data %>%
  filter(Cause.Name == "Suicide")

# map_data("state") identifies states by a lower-case `region` column, so a
# matching key is derived from the state name before joining.
data$region <- tolower(data$State)

states <- map_data("state")
suicide_map <- left_join(states, data, by = "region")

# Choropleth of the US states filled by Deaths / x, i.e. the suicide count
# divided by the `x` column of suicide_map (presumably the per-state total of
# deaths joined in earlier in the file -- confirm against the data prep step).
death_map <- ggplot(data = suicide_map) +
  geom_polygon(
    aes(x = long, y = lat, fill = Deaths / x, group = group),
    color = "white"
  ) +
  coord_fixed(1.3) +
  labs(x = "", y = "") +
  theme_void() +
  theme(
    panel.border = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.line = element_line(colour = "white")
  ) +
  scale_fill_continuous(high = "#132B43", low = "#56B1F7") +
  # Leave off the colour legend. "none" replaces the deprecated
  # guides(fill = FALSE) form.
  guides(fill = "none")

# Choropleth of the US states filled by the Age.adjusted.Death.Rate column of
# suicide_map.
death_adjusted_map <- ggplot(data = suicide_map) +
  geom_polygon(
    aes(x = long, y = lat, fill = Age.adjusted.Death.Rate, group = group),
    color = "white"
  ) +
  coord_fixed(1.3) +
  labs(x = "", y = "") +
  theme_void() +
  theme(
    panel.border = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.line = element_line(colour = "white")
  ) +
  scale_fill_continuous(high = "#132B43", low = "#56B1F7") +
  # Leave off the colour legend. "none" replaces the deprecated
  # guides(fill = FALSE) form.
  guides(fill = "none")

# Render both map plots through plotly's ggplotly() converter.
ggplotly(death_map)
ggplotly(death_adjusted_map)